##' ---
##' title: "2010 - 2012 CCES (2-Wave) Panel Data Analysis for Accounting for Pre-Treatment Exposure in Panel Data, Re-Estimating the Effect of Mass Public Shootings"
##' author: "Todd K. Hartman and Benjamin J. Newman"
##' date: "`r format(Sys.Date())`"
##' output: "github_document"
##' ---

##' Housekeeping
# install.packages("pacman")
# library('pacman')
## Load packages
pacman::p_load(data.table, geosphere, ggthemes,
               RStata, tidyverse, zipcode)

## Notes -- R Markdown doesn't play nicely with equal signs in comments, so they are
## replaced with '--'' or '->' so the file will render properly.
##
## There were 3 errors in our original mass shooting dataset -- the geocoding 
## for mass shooting id - 74 had the longitude and latitude reversed; for 
## id - 102, the wrong sign was used for longitude; for id - 222, the wrong 
## sign for latitude was used. We corrected these errors directly in the database.
##
## Although the codebook indicates that there are records for each respondent’s 
## residential zip code for both waves of the 2010-2012 (2-Wave) CCES Panel Study, 
## we could not find these variables in the dataset. A search of the codebook for 
## the stem ‘zip’ (search conducted: 3 March 2018; codebook version: 4 March, 2016) 
## revealed that there were 3 variables that contained or made reference to this term -
## ‘inputzip_10’, ‘regzip_10’, and ‘regzip_post_10’. The codebook describes 
## ‘inputzip_10’ as the relevant variable in wave 1 of the panel for identifying each 
## respondent’s residential zip code -  “So that we can ask you about the news and 
## events in your area, in what zip code do you currently reside?” Yet, this variable 
## was not present in the 2-wave panel dataset. Instead, we found 6 variables in the 
## dataset that contain the stem ‘zip’: regzip_10 (n -> 789), regzip_post_10 (n =->35), 
## reszip_10 (n -> 3,828), reszip_post_10 (n -> 160), regzip_12_pre (n -> 19,533), 
## regzip_12_post (n -> 19,533). While the 2012 zip codes at the voters’ registered 
## addresses appear to be complete, none of the 2010 zip code variables contain the full 
## sample of respondents.
## 
## We use a similar data processing method as described for the 2010-2014 (3-Wave) Panel Survey. 
## We removed 581 respondents who did not provide gun policy preferences at both waves. 
## As a workaround for the missing 2010 residential zip codes (see above), we adopt a 
## two-pronged strategy to ensure that only respondents who did not move between panel 
## waves were included in the analysis. First, we removed 1,179 respondents whose county 
## FIPS codes did not match in 2010 and 2012. Next, we used respondents’ self-reported 
## length of time at their current street address in years (addrlength_1_12) and months 
## (addrlength_2_12) and removed 667 respondents for whom their length of stay at their 
## current residence was shorter than the amount of time that had passed since the start 
## date of the interview (i.e., they reported that they had lived at their current address 
## on or after interviewing began on November 8th, 2010). In sum, 2,427 respondents, or 
## 12.4% of the sample, were excluded from our analysis, which is consistent with the 
## 10.4% removed from the 3-Wave CCES Panel Survey.

## Import Mass Shootings Data
mps <- read.csv("Mass_Shootings_Data_v1.csv")

## Format date
mps$date <- lubridate::dmy(mps$date)

##' Data wrangling
## Remove mass shootings that occurred after CCES interviews began in 2012
mps.sub <- subset(mps, date < "2012-10-02", 
                  select = c("id", "date", "victims", "lon", "lat"))
rm(mps)

## Import 2010-2012 (2-wave) CCES Panel data
load("CCES_2-Wave_Panel_10-12.rdata")

## [gun10, gun12] Gun control, old coding - 1 -> More Strict, 2 -> Less Strict, 3 -> Kept As They Are
## New coding 3 -> More Strict, 2 -> Same, 1 -> Less Strict
cces$gun10[cces$CC10_320 == 1] <- 3
cces$gun10[cces$CC10_320 == 2] <- 1
cces$gun10[cces$CC10_320 == 3] <- 2

cces$gun12[cces$CC12_320 == 1] <- 3
cces$gun12[cces$CC12_320 == 2] <- 1
cces$gun12[cces$CC12_320 == 3] <- 2

## Remove cases where there is missing data on the DV (at either wave)
completeFun <- function(data, desiredCols) {
    completeVec <- complete.cases(data[ , desiredCols])
    return(data[completeVec, ])
}

cces <- completeFun(cces, c("gun10", "gun12"))

## Check frequencies
prop.table(table(cces$gun10))
prop.table(table(cces$gun12))

## How many Rs did not change their views between panel waves?
table(cces$gun10 == cces$gun12)
prop.table(table(cces$gun10 == cces$gun12))

## How many shifted to gun control (positive scores = more control)?
table(gun10 = cces$gun10, gun12 = cces$gun12)
prop.table(table(gun10 = cces$gun10, gun12 = cces$gun12), 1)

## [guns.d12] Differenced gun control measure 
## Positive scores = more gun control; negative = less gun control
cces$guns.d12 <- cces$gun12 - cces$gun10
table(cces$guns.d12)
prop.table(table(cces$guns.d12))

## Check zip codes -- ZIPS ARE INCOMPLETE FOR 2010!!
## 2010 -- All have many NAs
table(is.na(cces$regzip_10))
table(is.na(cces$regzip_post_10))
table(is.na(cces$reszip_10))
table(is.na(cces$reszip_post_10))

## 2012 -- Looks OK
table(is.na(cces$regzip_12_pre)) 
table(is.na(cces$regzip_12_post)) 

## How many Rs provided the same residence COUNTY (FIPS)?
prop.table(table(cces$countyfips_10 == cces$countyfips_12))

## Workaround step 1 - remove cases where FIPS do not match between waves
cces <- subset(cces, countyfips_10 == countyfips_12)

## Length of time in years at current address (from 2012 wave)
summary(cces$addrlength_1_12)  # Years
summary(cces$addrlength_2_12)  # Months

## Calcualte duration in years at current address
cces$dur.address <- ifelse(is.na(cces$addrlength_2_12), cces$addrlength_1_12,
                           ifelse(is.na(cces$addrlength_1_12), cces$addrlength_2_12,
                           cces$addrlength_1_12 + cces$addrlength_2_12/12))
summary(cces$dur.address)
hist(cces$dur.address)

## Date moved to residence zip code
## Calculated as interview start date - duration at address
date.add <- as.POSIXlt(as.Date(cces$starttime_12))
date.add$year <- date.add$year - cces$dur.add
date.add <- as.Date(date.add)
head(date.add)
table(is.na(as.Date(date.add)))  # How many missing dates?
cces$date.address <- date.add

summary(cces$date.address)

## Workaround step 2 - remove cases where date at current address 
## is after Wave 1 interview began (i.e., date > "2010-11-07")
cces <- subset(cces, date.address <= "2010-11-07")

## [ideo, pid] Add in potential political moderators
cces$ideo <- ifelse(cces$ideo5_10 == 6, NA, cces$ideo5_10)  # [1 -> very liberal; 5 -> very conservative]
cces$pid <- ifelse(cces$pid7_10 == 8, NA, cces$pid7_10)  # [1 -> strong Democrat; 7 -> strong Republican]

## Subset the data
cces.sub <- subset(cces, select = c("caseid", "regzip_12_pre", "date.address",
                              "dur.address", "gun10", "gun12", "guns.d12", 
                              "ideo", "pid"))
colnames(cces.sub) <- c("caseid", "zip", "date.address", "dur.address",
                        "gun10", "gun12", "guns.d12", "ideo", "pid")

## Now add the coordinates to the zip centroids in the CCES data
data(zipcode)
cces.gps <- merge(cces.sub, zipcode, by.x = 'zip', by.y = 'zip')
cces.gps <- subset(cces.gps, select = c("caseid", "longitude", "latitude", "date.address",
                                        "dur.address", "gun10", "gun12", "guns.d12", 
                                        "ideo", "pid"))
colnames(cces.gps) <- c("caseid", "lon_r", "lat_r", "date.address", "dur.address",
                        "gun10", "gun12", "guns.d12", "ideo", "pid")

## Merge the CCES data with mass shootings data
cces.1 <- cces.gps[rep(row.names(cces.gps), nrow(mps.sub)), ]  # Duplicate each row
cces.1 <- cces.1[order(cces.1$caseid), ]
mps.1 <- mps.sub[rep(row.names(mps.sub), nrow(cces.gps)), ]  # Duplicate each row

## Convert the data.frame to a data.table
mps.dt <- setDT(mps.1)
cces.dt <- setDT(cces.1)

rm(cces.gps)
rm(cces.1)
rm(mps.1)

## Combine the CCES and shooting data
cces.dt <- cbind(cces.dt, mps.dt)
rm(mps.dt)

## [mi] Calculate distance in (miles) incorporating curvature of the Earth to ALL 142 events
cces.dt <- setDT(cces.dt)
cces.dt <- cces.dt[ , mi := distGeo(matrix(c(lon_r, lat_r), ncol = 2),   # ellipsoid distance
                                    matrix(c(lon, lat), ncol = 2))/1609]  # convert km to miles

## [mps.panel] Create identifier for events occurring b/w panel waves
cces.dt$mps.panel <- ifelse(cces.dt$date > "2010-11-07" & cces.dt$date < "2012-10-02", 1, 0)

## Descriptives for distances occurring b/w CCES panel waves (n -> 17)
summary(cces.dt$mi[cces.dt$mps.panel == 1])

## Rank the distances for each respondent from nearest to farthest
## for ALL shootings that occurred (n -> 142)
cces.dt <- cces.dt %>%
    group_by(caseid) %>%
    arrange(mi) %>%
    mutate(rank.mi.all = order(mi))

## Now rank the distances for each respondent from nearest to farthest
## for shootings that occurred b/w panel waves (n -> 17) & prior events (n -> 125)
cces.dt <- cces.dt %>%
    group_by(mps.panel, caseid) %>%
    arrange(mi) %>%
    mutate(rank.mi = order(mi))

cces.dt <- cces.dt[with(cces.dt, order(caseid, -mps.panel)), ]  # Sort the data

## Create treatment - within 100 miles of ANY mass shooting between panel waves
cces.dt$treat <- ifelse(cces.dt$mps.panel == 1 & cces.dt$mi <= 100, 1, 0)

## Merge with panel data
cces.sub <- left_join(cces.sub, subset(cces.dt, mps.panel == 1 & rank.mi == 1, 
                                       select = c("caseid", "treat", "victims")), 
                      by = "caseid")

summary(cces.sub$treat)
names(cces.sub)[names(cces.sub) == 'victims'] <- 'treat.vics'  #  Rename victims col

## How many Rs exposed to ANY mps within 100 miles b/w panel waves?
table(cces.sub$treat)

## Of ANY WITHIN 100 miles, how many changed gun attitudes?
summary(cces.sub$guns.d12[(cces.sub$treat == 1)])  # Mean change (plus how many missing)
table(cces.sub$guns.d12[(cces.sub$treat == 1)])  # How did Rs change?

## Of ANY BEYOND 100 miles, how many changed gun attitudes?
summary(cces.sub$guns.d12[(cces.sub$treat == 0)])  # Mean change (plus how many missing)
table(cces.sub$guns.d12[(cces.sub$treat == 0)])  # How did Rs change?

## Identify pre-treatment exposures
## ANY mass shooting b/w 2010-2012 within 100 miles
## AND NOT within 100 miles of other events that occurred prior to panel
yrs <- c("1966-01-01", "2000-01-01", "2005-01-01")  # Date thresholds
yrs.labels <- c("66", "00", "05")
tmp.df <- tmp.name1 <- tmp.name2 <- tmp.name3 <- pre100 <- vector(mode = "list", length(yrs))
for (m in 1:length(yrs)){
    tmp.name1 <- "caseid"
    tmp.name2 <- paste("pre", yrs.labels[[m]], sep = "")  # Define variable name for prior exposure
    tmp.name3 <- paste("treat", yrs.labels[[m]], sep = "")  # Define variable name for prior exposure
    tmp.df[[m]] <- subset(cces.dt, select = c("caseid", "date", "mi", "treat"))  # Temporary dataframe
    tmp.df[[m]]$pre100 <- ifelse(cces.dt$mi <= 100 & 
                                     (cces.dt$date >= yrs[[m]] & cces.dt$date < "2010-10-01"),
                                 1, 0)  # Exposed during these dates
    pre100[[m]] <- aggregate(pre100 ~ caseid, data = tmp.df[[m]], sum)  # Count prior exposures
    colnames(pre100[[m]]) <- c(tmp.name1, tmp.name2)  # Rename variables before merging
    cces.sub <- left_join(cces.sub, pre100[[m]], by = "caseid")  # Merge with CCES
    cces.sub[[tmp.name3]] <- ifelse(cces.sub[[tmp.name2]] == 0 & cces.sub$treat == 1, 1, 0)  # No prior exp.
}

## Create Nearest - NEAREST mass shooting occurred within 100
cces.dt$nearest <- ifelse(cces.dt$rank.mi.all == 1 & cces.dt$mi <= 100, 1, 0)

## Create treatment - NEAREST mass shooting occurred within 100 BETWEEN panel waves
cces.dt$treatnear <- ifelse(cces.dt$mps.panel == 1 & cces.dt$nearest, 1, 0)

## Merge with panel data
cces.sub <- left_join(cces.sub, subset(cces.dt, rank.mi.all == 1, 
                                       select = c("caseid", "treatnear", "victims")), 
                      by = "caseid")

summary(cces.sub$treatnear)
names(cces.sub)[names(cces.sub) == 'victims'] <- 'treatnearvics'  #  Rename victims col

## How many Rs exposed to NEAREST mps within 100 miles b/w panel waves?
table(cces.sub$treatnear)

## Reshape the data from wide to long to account for panel structure of the data
df <- subset(cces.sub, select = c("caseid", "gun10", "gun12", "ideo", "pid", 
                                  "pre00", "treat00", "pre05", "treat05", 
                                  "treat", "treatnear"))

df <- df %>%
    gather(year, gun, -c("caseid", "ideo", "pid", 
                         "pre00", "treat00", "pre05", "treat05", 
                         "treat", "treatnear"))
df$year <- ifelse(df$year == "gun10", "2010", "2012")

# Write the data
write.csv(df, file = "df_2-wave.csv", row.names = FALSE)
write_dta(df, "df_2-wave.dta", version = 14)

##' Data analysis
## RStata to call panel methods directly within STATA (on Mac OSX models)
options("RStata.StataPath" = 
            '/Applications/Stata/StataSE.app/Contents/MacOS/stata-se')  # On Mac OSX, change path for Windows
options("RStata.StataVersion" = 15)
stata("set more off, permanently")

stata_code <- "
** Convert strings to numeric
destring caseid - gun, force replace 

** Set as panel data
xtset caseid year

** Difference-in-difference estimators (random effects ordered logit)
xtologit gun i.year##i.treatnear, vce(cluster caseid) level(90)
xtologit gun i.year##i.treatnear, vce(cluster caseid) level(90) or

xtologit gun i.year##i.treat, vce(cluster caseid) level(90)
xtologit gun i.year##i.treat, vce(cluster caseid) level(90) or

xtologit gun i.year##i.treat05, vce(cluster caseid) level(90)
xtologit gun i.year##i.treat05, vce(cluster caseid) level(90) or

xtologit gun i.year##i.treat00, vce(cluster caseid) level(90)
xtologit gun i.year##i.treat00, vce(cluster caseid) level(90) or

** Blow-Up and Cluster Fixed Effects Model
** https://www.statalist.org/forums/forum/general-stata-discussion/general/485879-coding-a-fixed-effects-estimator-where-response-is-an-ordinal-variable-blow-up-and-cluster
capture program drop bucologit
program bucologit
version 11.2
syntax varlist [if] [in], Id(varname)

preserve

marksample touse
markout `touse' `id'

gettoken yraw x : varlist
tempvar y
qui egen int `y' = group(`yraw')

qui keep `y' `x' `id' `touse'
qui keep if `touse'

qui sum `y'
local ymax = r(max)
forvalues i = 2(1)`ymax' {
qui gen byte `yraw'`i' = `y' >= `i'
}
drop `y'

tempvar n cut newid
qui gen long `n' = _n
qui reshape long `yraw', i(`n') j(`cut')
qui egen long `newid' = group(`id' `cut')
sort `newid'
clogit `yraw' `x', group(`newid') cluster(`id') level(90)

restore            
end

** Fixed effects ordered logit models
gen treatnear_buc = treatnear
replace treatnear_buc = 0 if year == 2010

bucologit gun year treatnear_buc, id(caseid)
di exp(.0031363)
di exp(.2163191)
di exp(.4295018)

gen treat_buc = treat
replace treat_buc = 0 if year == 2010
bucologit gun year treat_buc, id(caseid)
di exp(-.0973409)
di exp(.0231937)
di exp(.1437283)

gen treat05_buc = treat05
replace treat05_buc = 0 if year == 2010
bucologit gun year treat05_buc, id(caseid)
di exp(.0881456)
di exp(.2908405)
di exp(.4935353)

gen treat00_buc = treat00
replace treat00_buc = 0 if year == 2010
bucologit gun year treat00_buc, id(caseid)
di exp(-.0125981)
di exp(.2153951)
di exp(.4433882)
"
stata(stata_code, data.in = df, data.out = TRUE)


## Create figures for publication
## Function to correct x-axis labels for nice plotting 
new_lines_adder = function(x, interval) {
    ## Add spaces after /
    x = str_replace_all(x, "/", "/ ")
    ## Split at spaces
    x.split = strsplit(x, " ")[[1]]
    ## Get length of snippets, add one for space
    lens <- nchar(x.split) + 1
    ## split the text into lines
    lines <- cumsum(lens) %/% (interval + 1)
    ## Construct the lines
    x.lines <- tapply(x.split, lines, function(line)
        paste0(paste(line, collapse=" "), "\n"), simplify = TRUE)
    ## Put everything into a single string
    result <- paste(x.lines, collapse="")
    ## Remove spaces we added after /
    result = str_replace_all(result, "/ ", "/")
    return(result)
}

## Function to create a wrapper for the above
add_newlines = function(x, total.length = 85) {
    ## Make sure x is a character array   
    x = as.character(x)
    ## Determine number of groups
    groups = length(x)
    ## Apply splitter to each
    t = sapply(x, FUN = new_lines_adder, 
               interval = round(total.length/groups), USE.NAMES=FALSE)
    return(t)
}

## Figure 2 random effects DiD odds ratios
fig2_stata <- data.frame(Plot = 1:8,
                        Data = c("3-Wave Panel (N = 8,513)", "2-Wave Panel (N = 17,106)", 
                                 "3-Wave Panel (N = 8,513)", "2-Wave Panel (N = 17,106)", 
                                 "3-Wave Panel (N = 8,513)", "2-Wave Panel (N = 17,106)", 
                                 "3-Wave Panel (N = 8,513)", "2-Wave Panel (N = 17,106)"),
                        Model = c("Nearest Event During Panel", "", 
                                  "Any Exposure During Panel", "", 
                                  "Any Exposure (Excluding Priors Since 2005)", "",
                                  "Any Exposure (Excluding Priors Since 2000)", ""),
                        Value = c(1.288918, 1.222363, 
                                  .9656595, 1.01763, 
                                  1.272811, 1.229271, 
                                  1.050475, 1.152851),
                        LB = c(1.041433, 1.013297, 
                               .8253111, .9146616, 
                               1.011516, 1.04086, 
                               .8253561, .9575074),
                        UB = c(1.595215, 1.474564, 
                               1.129875, 1.132191, 
                               1.601603, 1.451786, 
                               1.336996, 1.388048))

dev.new()
postscript("odds_ratio_figure_2_STATA.eps")

ggplot(fig2_stata, aes(x = Plot, y = Value, ymin = LB, ymax = UB, fill = Data)) + 
    geom_pointrange(aes(), 
                    position = position_dodge(width = 0.30)) + 
    geom_point(aes(), size = 4, shape = 21) +
    scale_fill_manual(values = alpha(c("white", "black")), 
                      guide = guide_legend(reverse = TRUE)) +
    ylab("Odds Ratio (with 90% CI)") +
    ylim(.75, 1.75) + 
    scale_x_discrete(limits = c(1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5, 8.5), 
                     labels = add_newlines(fig2_stata$Model, 120), name = "") +
    geom_hline(aes(yintercept = 1)) + 
    theme(axis.text.x = element_text(face = "bold", size = 12), 
          axis.text.y = element_text(face = "bold", size = 12),
          axis.title.y = element_text(face = "bold", size = 12),
          legend.title = element_text(face = "bold", size = 12),
          legend.text = element_text(face = "bold", size = 10),
          plot.margin = unit(c(1, 2, 1, 1), "cm")) +
    xlab("")

dev.off()

## Figure 3 fixed effects odds ratios
fig3_stata <- data.frame(Plot = 1:8,
                         Data = c("3-Wave Panel (N = 1,878)", "2-Wave Panel (N = 4,029)", 
                                  "3-Wave Panel (N = 1,878)", "2-Wave Panel (N = 4,029)", 
                                  "3-Wave Panel (N = 1,878)", "2-Wave Panel (N = 4,029)", 
                                  "3-Wave Panel (N = 1,878)", "2-Wave Panel (N = 4,029)"),
                         Model = c("Nearest Event During Panel", "", 
                                   "Any Exposure During Panel", "", 
                                   "Any Exposure (Excluding Priors Since 2005)", "",
                                   "Any Exposure (Excluding Priors Since 2000)", ""),
                         Value = c(1.3197405, 1.2414985, 
                                   .97427312, 1.0234648, 
                                   1.4068108, 1.3375512, 
                                   1.1740017, 1.2403519),
                         LB = c(1.0344206, 1.0031412, 
                                .81624363, .90724667, 
                                1.066025, 1.0921471, 
                                .86357364, .98748092),
                         UB = c(1.6837587, 1.5364919, 
                                1.1628982, 1.1545704, 
                                1.8565386, 1.6380972, 
                                1.5960192, 1.557977))

dev.new()
postscript("odds_ratio_figure_3_STATA.eps")

ggplot(fig3_stata, aes(x = Plot, y = Value, ymin = LB, ymax = UB, fill = Data)) + 
    geom_pointrange(aes(), 
                    position = position_dodge(width = 0.30)) + 
    geom_point(aes(), size = 4, shape = 21) +
    scale_fill_manual(values = alpha(c("white", "black")), 
                      guide = guide_legend(reverse = TRUE)) +
    ylab("Odds Ratio (with 90% CI)") +
    scale_y_continuous(breaks = c(.75, 1, 1.25, 1.5, 1.75, 2)) +
    scale_x_discrete(limits = c(1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5, 8.5), 
                     labels = add_newlines(fig3_stata$Model, 120), name = "") +
    geom_hline(aes(yintercept = 1)) + 
    theme(axis.text.x = element_text(face = "bold", size = 12), 
          axis.text.y = element_text(face = "bold", size = 12),
          axis.title.y = element_text(face = "bold", size = 12),
          legend.title = element_text(face = "bold", size = 12),
          legend.text = element_text(face = "bold", size = 10),
          plot.margin = unit(c(1, 2, 1, 1), "cm")) +
    xlab("")

dev.off()

